/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package xla.memory_space_assignment;

// Memory space assignment options for slicing prefetches into smaller
// asynchronous copies, reducing prefetch memory allocation pressure.
//
// No prefetch slicing is performed if max_slices == 0.
//
// None of the fields below carries the `optional` label, so they have proto3
// implicit presence: an unset field is indistinguishable from one explicitly
// set to 0 / false.
//
// TODO(b/275905276): Consider adding another option that indicates that we want
// slices of a certain size, rather than just always creating max_slices.
message SlicedPrefetchOptions {
  // The maximum number of slices into which to slice a prefetch. A value of 0
  // (which is also what an unset field reads as) disables prefetch slicing.
  uint32 max_slices = 1;

  // The minimum tensor size, in bytes, that we will attempt to slice.
  // NOTE(review): whether a tensor of exactly min_bytes is eligible for
  // slicing is not stated here -- confirm against the consumer of this option.
  uint64 min_bytes = 2;

  // Not for production use: this option should never be set to true in
  // production. When this is true, we will crash if we propose a slice (other
  // than the final slice) with a size that is not a multiple of the required
  // hardware alignment. When this is false, we will instead choose not to
  // slice in such situations, which is always safe.
  bool fail_on_non_alignment_boundary_slice_proposal = 3;
}

// Options for memory-bound loop optimizations in memory space assignment. If
// enabled, this pass can optimize memory-bound unrolled loops to maximize the
// bandwidth utilized and minimize the execution time.
//
// Every field below is declared `optional`, so each has explicit presence: a
// reader can distinguish an unset field from one explicitly set to false / 0.
// NOTE(review): the values applied when a field is left unset are chosen by
// the consumer of this message and are not visible in this file.
message MemoryBoundLoopOptimizerOptions {
  // Enable the memory-bound loop optimizations.
  optional bool enabled = 1;

  // The desired ratio of overlapped operations that is sufficient to overlap
  // prefetches with.
  //   * If this value is 1, the algorithm will try to fully overlap the
  //     prefetches with other compute.
  //   * If this value is less than 1, the algorithm may schedule prefetches
  //     such that some of the prefetch is not overlapped, and so may become
  //     critical.
  // For example, if this value is 0.5, we are willing for the prefetch time to
  // take up to 2X of the overlapped computation time.
  // NOTE(review): behavior for values greater than 1, or for zero / negative
  // values, is not described here -- confirm against the consumer.
  optional float desired_copy_ratio = 2;

  // If true, the algorithm allows a fully pipelined prefetch to be scheduled
  // even if the copy resources haven't reached the desired copy ratio
  // (presumably desired_copy_ratio above -- confirm). A fully pipelined
  // prefetch is one that starts at the same time as its counterpart in the
  // previous iteration finishes.
  optional bool allow_unsatisfied_fully_pipelined_prefetch = 3;
}
